Assignment 10¶

DATA602 \ Michael Ippolito

In [154]:
# core
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# ml
from sklearn import datasets as ds
from sklearn import linear_model as lm
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import train_test_split as tts

#plotly or other graphing library
import plotly.express as px
import plotly.io as pio
from matplotlib import pyplot as plt

# Set plotly renderer
pio.renderers.default = 'notebook'
In [155]:
%%HTML
<!-- Set dataframe style -->
<style>.dataframe th{
background:#3f577c; 
font-family:monospace; 
color:white; 
border:3px solid white; 
text-align:left !important;}
</style>
In [156]:
# Load datasets here once and assign to variables iris and boston

# Load iris data
iris = ds.load_iris()

# Load boston data
boston = ds.load_boston()

Q1

Data set: Iris

  • Return the first 5 rows of the data including the feature names as column headings in a DataFrame and a separate Python list containing target names
In [157]:
# Create iris dataframe
iris_X = pd.DataFrame(iris['data'])
iris_y = iris['target']
iris_X.columns = iris['feature_names']
print("Iris features:")
display(iris_X.head())
print()
print("Iris target names:")
print(iris['target_names'])
Iris features:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
Iris target names:
['setosa' 'versicolor' 'virginica']

Q2

Data set: Iris

  • Fit the Iris dataset into a kNN model with neighbors=5 and predict the category of observations passed in argument new_observations. Return back the target names of each prediction (and not their encoded values, i.e. return setosa instead of 0).
In [158]:
# Fit to k=5 nearest neighbors
knn = KNN(n_neighbors=5)
knn.fit(iris_X, iris_y)

# Predict new observations
new_observations = knn.predict(iris_X)
print("New observations:")
print(iris['target_names'][new_observations])
New observations:
['setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'setosa' 'versicolor' 'versicolor' 'versicolor' 'versicolor'
 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor'
 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor'
 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor'
 'versicolor' 'virginica' 'versicolor' 'virginica' 'versicolor'
 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor'
 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'virginica'
 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor'
 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor'
 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor'
 'versicolor' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'versicolor' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica' 'versicolor' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica' 'virginica' 'virginica' 'virginica' 'virginica'
 'virginica' 'virginica' 'virginica']

Q3 15 pts

Data set: Iris

  • Split the Iris dataset into a train / test model with the split ratio between the two established by the function parameter split.

  • Fit KNN with the training data with number of neighbors equal to the function parameter neighbors

  • Generate and return back an accuracy score using the test data that was split out

In [159]:
# Set function parameters
split = 0.3
k = 5

# Split the data into train and test
(X_train, X_test, y_train, y_test) = tts(iris_X, iris_y, test_size=split, random_state=777)

# Fit the KNN model
knn = KNN(n_neighbors = k)
knn.fit(X_train, y_train)

# Generate score
knn_score = knn.score(X_test, y_test)
print('Accuracy:', knn_score)
Accuracy: 0.9777777777777777

Q4

Data set: Iris

  • Generate an overfitting / underfitting curve of kNN each of the testing and training accuracy performance scores series for a range of neighbor (k) values from 1 to 30 and plot the curves (number of neighbors is x-axis, performance score is y-axis on the chart).
In [160]:
# Set split rate for test/training data
split = 0.3

# Initialize dataframe to store accuracy scores for each model run
dfscores = pd.DataFrame(columns=['k', 'accuracy'])
display(dfscores)

# Iterate over range of k values
for k in range(1, 31):

    # Split the data into train and test
    (X_train, X_test, y_train, y_test) = tts(iris_X, iris_y, test_size=split, random_state=777)

    # Fit the KNN model
    knn = KNN(n_neighbors = k)
    knn.fit(X_train, y_train)

    # Generate score
    knn_score = knn.score(X_test, y_test)
    #print('Accuracy (k=' + str(k) + '):' + str(knn_score))
    dfnew = pd.DataFrame({'k': k, 'accuracy': knn_score}, columns=dfscores.columns, index=[k])
    dfscores = pd.concat([dfscores, dfnew], axis=0)

# Plot
fig = px.scatter(dfscores, x='k', y='accuracy', template='plotly_white')
fig.show()
k accuracy

Q5 10 pts

Data set: Boston

  • Load sklearn's Boston data into a DataFrame (only the data and feature_name as column names)

  • Load sklearn's Boston target values into a separate DataFrame

  • Return back the average of AGE, average of the target (median value of homes or MEDV), and the target as NumPy values

In [161]:
# Load boston data into dataframes
boston_X = pd.DataFrame(boston['data'], columns=boston['feature_names'])
boston_y = pd.DataFrame(boston['target'], columns=['MEDV'])

# Print select averages
print('Average age:', np.mean(boston_X['AGE']))
print('Average median home value:', np.median(boston_y['MEDV']))
print()

# Print target values
print('Target values:')
boston_y_arr = np.array(boston_y)
print(boston_y_arr)
Average age: 68.57490118577076
Average median home value: 21.2

Target values:
[[24. ]
 [21.6]
 [34.7]
 [33.4]
 [36.2]
 [28.7]
 [22.9]
 [27.1]
 [16.5]
 [18.9]
 [15. ]
 [18.9]
 [21.7]
 [20.4]
 [18.2]
 [19.9]
 [23.1]
 [17.5]
 [20.2]
 [18.2]
 [13.6]
 [19.6]
 [15.2]
 [14.5]
 [15.6]
 [13.9]
 [16.6]
 [14.8]
 [18.4]
 [21. ]
 [12.7]
 [14.5]
 [13.2]
 [13.1]
 [13.5]
 [18.9]
 [20. ]
 [21. ]
 [24.7]
 [30.8]
 [34.9]
 [26.6]
 [25.3]
 [24.7]
 [21.2]
 [19.3]
 [20. ]
 [16.6]
 [14.4]
 [19.4]
 [19.7]
 [20.5]
 [25. ]
 [23.4]
 [18.9]
 [35.4]
 [24.7]
 [31.6]
 [23.3]
 [19.6]
 [18.7]
 [16. ]
 [22.2]
 [25. ]
 [33. ]
 [23.5]
 [19.4]
 [22. ]
 [17.4]
 [20.9]
 [24.2]
 [21.7]
 [22.8]
 [23.4]
 [24.1]
 [21.4]
 [20. ]
 [20.8]
 [21.2]
 [20.3]
 [28. ]
 [23.9]
 [24.8]
 [22.9]
 [23.9]
 [26.6]
 [22.5]
 [22.2]
 [23.6]
 [28.7]
 [22.6]
 [22. ]
 [22.9]
 [25. ]
 [20.6]
 [28.4]
 [21.4]
 [38.7]
 [43.8]
 [33.2]
 [27.5]
 [26.5]
 [18.6]
 [19.3]
 [20.1]
 [19.5]
 [19.5]
 [20.4]
 [19.8]
 [19.4]
 [21.7]
 [22.8]
 [18.8]
 [18.7]
 [18.5]
 [18.3]
 [21.2]
 [19.2]
 [20.4]
 [19.3]
 [22. ]
 [20.3]
 [20.5]
 [17.3]
 [18.8]
 [21.4]
 [15.7]
 [16.2]
 [18. ]
 [14.3]
 [19.2]
 [19.6]
 [23. ]
 [18.4]
 [15.6]
 [18.1]
 [17.4]
 [17.1]
 [13.3]
 [17.8]
 [14. ]
 [14.4]
 [13.4]
 [15.6]
 [11.8]
 [13.8]
 [15.6]
 [14.6]
 [17.8]
 [15.4]
 [21.5]
 [19.6]
 [15.3]
 [19.4]
 [17. ]
 [15.6]
 [13.1]
 [41.3]
 [24.3]
 [23.3]
 [27. ]
 [50. ]
 [50. ]
 [50. ]
 [22.7]
 [25. ]
 [50. ]
 [23.8]
 [23.8]
 [22.3]
 [17.4]
 [19.1]
 [23.1]
 [23.6]
 [22.6]
 [29.4]
 [23.2]
 [24.6]
 [29.9]
 [37.2]
 [39.8]
 [36.2]
 [37.9]
 [32.5]
 [26.4]
 [29.6]
 [50. ]
 [32. ]
 [29.8]
 [34.9]
 [37. ]
 [30.5]
 [36.4]
 [31.1]
 [29.1]
 [50. ]
 [33.3]
 [30.3]
 [34.6]
 [34.9]
 [32.9]
 [24.1]
 [42.3]
 [48.5]
 [50. ]
 [22.6]
 [24.4]
 [22.5]
 [24.4]
 [20. ]
 [21.7]
 [19.3]
 [22.4]
 [28.1]
 [23.7]
 [25. ]
 [23.3]
 [28.7]
 [21.5]
 [23. ]
 [26.7]
 [21.7]
 [27.5]
 [30.1]
 [44.8]
 [50. ]
 [37.6]
 [31.6]
 [46.7]
 [31.5]
 [24.3]
 [31.7]
 [41.7]
 [48.3]
 [29. ]
 [24. ]
 [25.1]
 [31.5]
 [23.7]
 [23.3]
 [22. ]
 [20.1]
 [22.2]
 [23.7]
 [17.6]
 [18.5]
 [24.3]
 [20.5]
 [24.5]
 [26.2]
 [24.4]
 [24.8]
 [29.6]
 [42.8]
 [21.9]
 [20.9]
 [44. ]
 [50. ]
 [36. ]
 [30.1]
 [33.8]
 [43.1]
 [48.8]
 [31. ]
 [36.5]
 [22.8]
 [30.7]
 [50. ]
 [43.5]
 [20.7]
 [21.1]
 [25.2]
 [24.4]
 [35.2]
 [32.4]
 [32. ]
 [33.2]
 [33.1]
 [29.1]
 [35.1]
 [45.4]
 [35.4]
 [46. ]
 [50. ]
 [32.2]
 [22. ]
 [20.1]
 [23.2]
 [22.3]
 [24.8]
 [28.5]
 [37.3]
 [27.9]
 [23.9]
 [21.7]
 [28.6]
 [27.1]
 [20.3]
 [22.5]
 [29. ]
 [24.8]
 [22. ]
 [26.4]
 [33.1]
 [36.1]
 [28.4]
 [33.4]
 [28.2]
 [22.8]
 [20.3]
 [16.1]
 [22.1]
 [19.4]
 [21.6]
 [23.8]
 [16.2]
 [17.8]
 [19.8]
 [23.1]
 [21. ]
 [23.8]
 [23.1]
 [20.4]
 [18.5]
 [25. ]
 [24.6]
 [23. ]
 [22.2]
 [19.3]
 [22.6]
 [19.8]
 [17.1]
 [19.4]
 [22.2]
 [20.7]
 [21.1]
 [19.5]
 [18.5]
 [20.6]
 [19. ]
 [18.7]
 [32.7]
 [16.5]
 [23.9]
 [31.2]
 [17.5]
 [17.2]
 [23.1]
 [24.5]
 [26.6]
 [22.9]
 [24.1]
 [18.6]
 [30.1]
 [18.2]
 [20.6]
 [17.8]
 [21.7]
 [22.7]
 [22.6]
 [25. ]
 [19.9]
 [20.8]
 [16.8]
 [21.9]
 [27.5]
 [21.9]
 [23.1]
 [50. ]
 [50. ]
 [50. ]
 [50. ]
 [50. ]
 [13.8]
 [13.8]
 [15. ]
 [13.9]
 [13.3]
 [13.1]
 [10.2]
 [10.4]
 [10.9]
 [11.3]
 [12.3]
 [ 8.8]
 [ 7.2]
 [10.5]
 [ 7.4]
 [10.2]
 [11.5]
 [15.1]
 [23.2]
 [ 9.7]
 [13.8]
 [12.7]
 [13.1]
 [12.5]
 [ 8.5]
 [ 5. ]
 [ 6.3]
 [ 5.6]
 [ 7.2]
 [12.1]
 [ 8.3]
 [ 8.5]
 [ 5. ]
 [11.9]
 [27.9]
 [17.2]
 [27.5]
 [15. ]
 [17.2]
 [17.9]
 [16.3]
 [ 7. ]
 [ 7.2]
 [ 7.5]
 [10.4]
 [ 8.8]
 [ 8.4]
 [16.7]
 [14.2]
 [20.8]
 [13.4]
 [11.7]
 [ 8.3]
 [10.2]
 [10.9]
 [11. ]
 [ 9.5]
 [14.5]
 [14.1]
 [16.1]
 [14.3]
 [11.7]
 [13.4]
 [ 9.6]
 [ 8.7]
 [ 8.4]
 [12.8]
 [10.5]
 [17.1]
 [18.4]
 [15.4]
 [10.8]
 [11.8]
 [14.9]
 [12.6]
 [14.1]
 [13. ]
 [13.4]
 [15.2]
 [16.1]
 [17.8]
 [14.9]
 [14.1]
 [12.7]
 [13.5]
 [14.9]
 [20. ]
 [16.4]
 [17.7]
 [19.5]
 [20.2]
 [21.4]
 [19.9]
 [19. ]
 [19.1]
 [19.1]
 [20.1]
 [19.9]
 [19.6]
 [23.2]
 [29.8]
 [13.8]
 [13.3]
 [16.7]
 [12. ]
 [14.6]
 [21.4]
 [23. ]
 [23.7]
 [25. ]
 [21.8]
 [20.6]
 [21.2]
 [19.1]
 [20.6]
 [15.2]
 [ 7. ]
 [ 8.1]
 [13.6]
 [20.1]
 [21.8]
 [24.5]
 [23.1]
 [19.7]
 [18.3]
 [21.2]
 [17.5]
 [16.8]
 [22.4]
 [20.6]
 [23.9]
 [22. ]
 [11.9]]

Q6

Data set: Boston

  • In the Boston dataset, the feature PTRATIO refers to pupil teacher ratio.

  • Using a matplotlib scatter plot, plot MEDV median value of homes as y-axis and PTRATIO as x-axis

  • Return back PTRATIO as a NumPy array

In [162]:
# Plot MEDV vs PTRATIO
plt.scatter(x=boston_X['PTRATIO'], y=boston_y['MEDV'])
plt.xlabel('Pupil-teacher ratio')
plt.ylabel('Median home value ($1000s)')
plt.show()

# Print PTRATIO as numpy array
boston_X_arr = np.array(boston_X['PTRATIO'])
print(boston_X_arr)
[15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 15.2 15.2 15.2 21.
 21.  21.  21.  21.  21.  21.  21.  21.  21.  21.  21.  21.  21.  21.
 21.  21.  21.  21.  21.  21.  21.  19.2 19.2 19.2 19.2 18.3 18.3 17.9
 17.9 17.9 17.9 17.9 17.9 17.9 17.9 17.9 16.8 16.8 16.8 16.8 21.1 17.9
 17.3 15.1 19.7 19.7 19.7 19.7 19.7 19.7 18.6 16.1 16.1 18.9 18.9 18.9
 19.2 19.2 19.2 19.2 18.7 18.7 18.7 18.7 18.7 18.7 19.  19.  19.  19.
 18.5 18.5 18.5 18.5 17.8 17.8 17.8 17.8 18.2 18.2 18.2 18.  18.  18.
 18.  18.  20.9 20.9 20.9 20.9 20.9 20.9 20.9 20.9 20.9 20.9 20.9 17.8
 17.8 17.8 17.8 17.8 17.8 17.8 17.8 17.8 19.1 19.1 19.1 19.1 19.1 19.1
 19.1 21.2 21.2 21.2 21.2 21.2 21.2 21.2 21.2 21.2 21.2 21.2 21.2 21.2
 21.2 21.2 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7
 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7 14.7
 14.7 14.7 14.7 14.7 16.6 16.6 16.6 16.6 16.6 16.6 16.6 17.8 17.8 17.8
 17.8 17.8 17.8 17.8 17.8 15.2 15.2 15.2 15.2 15.2 15.2 15.6 15.6 14.4
 12.6 12.6 12.6 17.  17.  14.7 14.7 14.7 14.7 18.6 18.6 18.6 18.6 18.6
 18.6 18.6 18.6 18.6 18.6 18.6 16.4 16.4 16.4 16.4 17.4 17.4 17.4 17.4
 17.4 17.4 17.4 17.4 17.4 17.4 17.4 17.4 17.4 17.4 17.4 17.4 17.4 17.4
 16.6 16.6 16.6 16.6 16.6 16.6 19.1 19.1 19.1 19.1 19.1 19.1 19.1 19.1
 19.1 19.1 16.4 16.4 15.9 13.  13.  13.  13.  13.  13.  13.  13.  13.
 13.  13.  13.  18.6 18.6 18.6 18.6 18.6 17.6 17.6 17.6 17.6 17.6 14.9
 14.9 14.9 14.9 13.6 15.3 15.3 18.2 16.6 16.6 16.6 19.2 19.2 19.2 16.
 16.  16.  16.  16.  14.8 14.8 14.8 16.1 16.1 16.1 18.4 18.4 18.4 18.4
 18.4 18.4 18.4 18.4 18.4 18.4 18.4 18.4 18.4 18.4 18.4 18.4 19.6 19.6
 19.6 19.6 19.6 19.6 19.6 19.6 16.9 16.9 16.9 16.9 16.9 20.2 20.2 20.2
 20.2 20.2 20.2 20.2 20.2 15.5 15.9 17.6 17.6 18.8 18.8 17.9 17.  19.7
 19.7 18.3 18.3 17.  22.  22.  20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2
 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2
 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2
 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2
 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2
 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2
 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2
 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2
 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2
 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.2 20.1 20.1
 20.1 20.1 20.1 19.2 19.2 19.2 19.2 19.2 19.2 19.2 19.2 21.  21.  21.
 21.  21. ]

Q7

Data set: Boston

  • Create a regression model for MEDV / PTRATIO and display a chart showing the regression line using matplotlib
  • Use np.linspace() to generate prediction X values from min to max PTRATIO

  • Return back the regression prediction space and regression predicted values

  • Make sure to labels axes appropriately

In [163]:
# Fit linear model to data
linreg = lm.LinearRegression()
linreg.fit(boston_X_arr.reshape(-1, 1), boston_y_arr.reshape(-1, 1))

# Create prediction space x
x_test = np.linspace(np.min(boston_X_arr), np.max(boston_X_arr)).reshape(-1, 1)

# Predict y values
y_pred = linreg.predict(x_test)

# Print x and y
print("Prediction space (x):", x_test)
print()
print("Predicted values (y):", y_pred)
print()

# Plot x,y data along with regression line
plt.scatter(boston_X_arr, boston_y_arr)
plt.plot(x_test, y_pred, color='black', linewidth=3)
plt.xlabel('Pupil-teacher ratio')
plt.ylabel('Median home value ($1000s)')
plt.show()
Prediction space (x): [[12.6       ]
 [12.79183673]
 [12.98367347]
 [13.1755102 ]
 [13.36734694]
 [13.55918367]
 [13.75102041]
 [13.94285714]
 [14.13469388]
 [14.32653061]
 [14.51836735]
 [14.71020408]
 [14.90204082]
 [15.09387755]
 [15.28571429]
 [15.47755102]
 [15.66938776]
 [15.86122449]
 [16.05306122]
 [16.24489796]
 [16.43673469]
 [16.62857143]
 [16.82040816]
 [17.0122449 ]
 [17.20408163]
 [17.39591837]
 [17.5877551 ]
 [17.77959184]
 [17.97142857]
 [18.16326531]
 [18.35510204]
 [18.54693878]
 [18.73877551]
 [18.93061224]
 [19.12244898]
 [19.31428571]
 [19.50612245]
 [19.69795918]
 [19.88979592]
 [20.08163265]
 [20.27346939]
 [20.46530612]
 [20.65714286]
 [20.84897959]
 [21.04081633]
 [21.23265306]
 [21.4244898 ]
 [21.61632653]
 [21.80816327]
 [22.        ]]

Predicted values (y): [[35.16421874]
 [34.75039328]
 [34.33656781]
 [33.92274235]
 [33.50891688]
 [33.09509142]
 [32.68126595]
 [32.26744049]
 [31.85361502]
 [31.43978956]
 [31.02596409]
 [30.61213863]
 [30.19831316]
 [29.7844877 ]
 [29.37066224]
 [28.95683677]
 [28.54301131]
 [28.12918584]
 [27.71536038]
 [27.30153491]
 [26.88770945]
 [26.47388398]
 [26.06005852]
 [25.64623305]
 [25.23240759]
 [24.81858212]
 [24.40475666]
 [23.99093119]
 [23.57710573]
 [23.16328026]
 [22.7494548 ]
 [22.33562933]
 [21.92180387]
 [21.5079784 ]
 [21.09415294]
 [20.68032747]
 [20.26650201]
 [19.85267654]
 [19.43885108]
 [19.02502561]
 [18.61120015]
 [18.19737468]
 [17.78354922]
 [17.36972375]
 [16.95589829]
 [16.54207282]
 [16.12824736]
 [15.71442189]
 [15.30059643]
 [14.88677096]]